In [1]:
from utils import Activities, Users

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"
import plotly.figure_factory as ff

import numpy as np

from config_vars import month_map, month_order
In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows    = None
In [3]:
data_folder = "../data/"

#plotly style:
line_traces = dict(mode='lines+markers',line_shape='spline',line_smoothing=1,marker_size=10,marker_opacity=0.9)
In [ ]:
 

Data Loading and Cleaning

In [4]:
users = Users(f"{data_folder}users.csv")
students = users.df.query("student == 1")
supervisors = users.df.query("supervisor == 1")
teachers  = users.df.query("teacher == 1")


activities = Activities(f"{data_folder}activities.csv")

activities.drop_over_year() # TOASK: a Nicola utenti oltre l'anno
activities.map_month(month_map,month_order) # map and sort months
activities.save_raw() # create a copy of non cleaned dataframe
activities.fillna() # fill some columns with 0 instead of NaN
activities.year_to_cat()


feedbacks = pd.read_csv(f"{data_folder}activities_feedbacks_info.csv")
feedbacks = feedbacks.loc[feedbacks['ac_activity'].isin(activities.df['ac_activity']),:]

df_months = pd.read_csv(f"{data_folder}months.csv").fillna(0)
df_months.replace({'month':month_map},inplace=True)
df_months['month_order'] = df_months['month'].map(dict(zip(month_order,range(1,13))))
df_months.sort_values(by=['month_order','activity_school_year'],inplace=True)

y_users = df_months['n_users_per_year']
df_months['norm_avg_n_user_recipes']        = df_months['n_recipes'].divide(y_users)
df_months['norm_avg_n_user_experiences']    = df_months['n_experiences'].divide(y_users)
df_months['norm_avg_n_activities']          = df_months['n_activities'].divide(y_users)

logins = pd.read_csv(f"{data_folder}log_logins.csv")

Data Exploring

Overview:

In [5]:
n_supervisors = len(supervisors)
n_students    = len(students)
n_teachers    = len(users.df.query("teacher == 1"))

print(f"There are {n_students} students, {n_supervisors} supervisors, and {n_teachers} teachers")
There are 544 students, 226 supervisors, and 21 teachers
In [6]:
activities.plot_dashboard()

Number of activities

In [7]:
hist_data = activities.df.groupby(by=['creation_month','activity_school_year','start_year'],sort=False).size().reset_index(name="count")
hist_data = activities.sort_month(hist_data,'creation_month',['activity_school_year','start_year'])

fig = hist_data.plot(x="creation_month", y=["count"], facet_col='activity_school_year', color="start_year")
fig.update_layout(
    title="Number of activities per start year",
    title_x=0.5,
    yaxis_title="# of activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Number of activities per student

In [8]:
fig = students.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_activities", color="start_year",
    hover_data =["us_user","n_activities","start_year"]
)

fig.update_layout(
    title="Number of activities per student per start year",
    title_x=0.5,
    yaxis_title="Activities per student",
)

fig.update_yaxes(range=[0, 600])
fig.show()

Number of active users

In [9]:
hist_data = activities.df.groupby(by=['activity_school_year','start_year','us_user'],sort=False).size().reset_index(name="count")
active_users_per_year = hist_data.query("count > 0").drop("us_user",axis=1).groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count").sort_values(by=["start_year","activity_school_year"])
In [10]:
fig = active_users_per_year.plot(x="start_year", y="count", facet_col='activity_school_year')
fig.update_layout(
    title="Number of active users per year",
    title_x=0.5,
    yaxis_title="# of actitve users",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45, type="category")
fig.update_traces(line_traces)

fig.show()

Number of feedbacks

In [11]:
hist_data = feedbacks.groupby(by=['activity_school_year','start_year'],sort=False).size().reset_index(name="count")
In [12]:
fig = hist_data.plot(x="start_year", y=["count"], facet_col='activity_school_year')
fig.update_layout(
    title="Number of feedbacks per start year",
    title_x=0.5,
    yaxis_title="# of feedbacks",
    showlegend=False,
)
fig.update_xaxes(title=dict(text=""),tickangle=45,type="category")
fig.update_traces(line_traces)

fig.show()
In [13]:
feedbacks['response_year'] = pd.DatetimeIndex(feedbacks['response_date']).year
data_hist = feedbacks.groupby(by=['recipient','response_year']).size().reset_index(name="count")

fig = data_hist.sort_values(by=["response_year"]).plot.box(
    x="response_year",y="count", color="response_year", 
    hover_data =["recipient","response_year","count"]
)

fig.update_layout(
    title="Number of feedbacks per supervisor",
    title_x=0.5,
    yaxis_title="Feedbacks per supervisor",
)

fig.show()

Number of feedback responses per supervisor

In [14]:
fig = supervisors.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_feedback_responses", color="start_year",
    hover_data =["us_user","user_name","n_feedback_responses","start_year"]
)

fig.update_layout(
    title="Number of feedbacks per supervisor per start year",
    title_x=0.5,
    yaxis_title="Feedbacks per supervisor",
)

fig.show()

Number of feedback requests per student

In [15]:
fig = students.sort_values(by=["start_year"]).plot.box(
    x="start_year",y="n_feedback_requests", color="start_year",
    hover_data =["us_user","user_name","n_feedback_requests","start_year"]
)

fig.update_layout(
    title="Number of feedback requests per student",
    title_x=0.5,
    yaxis_title="Requests per student",
)

fig.show()

Activities images

In [16]:
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
    x="activity_school_year",y="n_images", 
    color="start_year", 
    hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)

fig.update_layout(
    title="Number of images per activity",
    title_x=0.5,
)

fig.show()

Activities total lenght

In [17]:
fig = activities.df.sort_values(by=["start_year","activity_school_year"]).plot.box(
    x="activity_school_year",y="activity_total_length", 
    color="start_year", 
    hover_data =["ac_activity","ac_title","activity_total_length","start_year"]
)

fig.update_layout(
    title="Total lenghts",
    title_x=0.5,
)

fig.update_yaxes(range=[0, 600])
fig.show()

Months trends:

Activities per month

Creation

In [18]:
fig = df_months.plot(x="month", y=["n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="# activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [19]:
fig = df_months.plot(x="month", y=["norm_avg_n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Number of activities per users per month",
    title_x=0.5,
    yaxis_title="average # of activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Edits

In [20]:
fig = df_months.plot(x="month", y=["n_edits"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="# activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Curriculum

In [21]:
fig = activities.sort_month(
        activities.df[['last_edit_month','activity_school_year','in_curriculum']],
        'last_edit_month',['activity_school_year']
).hist(x="last_edit_month", y="in_curriculum", 
        facet_col='activity_school_year',
)

fig.update_layout(
    title="# of activities in curriculum per month",
    title_x=0.5,
    yaxis_title="# activities in curriculum",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)


fig.show()

Feedbacks

In [112]:
feedbacks['response_month'] = pd.DatetimeIndex(feedbacks['response_date']).month
feedbacks.replace({'response_month':month_map},inplace=True)
feedbacks['request_month'] = pd.DatetimeIndex(feedbacks['request_date']).month
feedbacks.replace({'request_month':month_map},inplace=True)
In [137]:
hist_requests = activities.sort_month(
    feedbacks.groupby(['activity_school_year','request_month']).size().reset_index(name="count"),
    'request_month',['activity_school_year']
).rename(columns={'request_month':'month'}).assign(type = 'request')

hist_responses = activities.sort_month(
    feedbacks.groupby(['activity_school_year','response_month']).size().reset_index(name="count"),
    'response_month',['activity_school_year']
).rename(columns={'response_month':'month'}).assign(type = 'response')

hist_data = hist_requests.append(hist_responses)
In [140]:
fig = hist_data.plot(
    x="month", y=["count"],facet_col='activity_school_year', color="type"
)

fig.update_layout(
    title="Total number of feedback requests and responses per month",
    title_x=0.5,
    yaxis_title="count",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [146]:
feedbacks['has_response'] = feedbacks['response_date'].notnull()
In [163]:
a = feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
        feedbacks.groupby(["request_month","activity_school_year"]).size()
    ).reset_index(name="ratio")

total = feedbacks.groupby(["request_month","activity_school_year"]).size()
In [190]:
fig = activities.sort_month(
    feedbacks.groupby(["request_month","activity_school_year"]).sum()['has_response'].div(
        feedbacks.groupby(["request_month","activity_school_year"]).size()
    ).reset_index(name="ratio").assign(
        total = feedbacks.groupby(["request_month","activity_school_year"]).size().values.astype('int64')
    ),'request_month',['activity_school_year']
).plot.scatter(
    x="request_month", y=["ratio"],facet_col='activity_school_year', 
    size="total", 
    color="total",
    size_max=60
)

fig.update_layout(
    title="Ratio responses/requests",
    title_x=0.5,
    yaxis_title="ratio",
    showlegend = False
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces,line=dict(color="gray"))

fig.show()

Feedbacks

Ratio

In [265]:
tot_req = feedbacks.groupby(['recipient']).size()
tot_resp = feedbacks.query("has_response == True").groupby(['recipient']).size()
tot_resp = (tot_req-tot_req+b).fillna(0)
ratio = tot_resp.div(tot_req).reset_index(name="ratio").rename(columns={'recipient':'us_user'})
ratio = ratio.merge(supervisors[['us_user','start_year','user_name']],on="us_user",how="right").dropna()
tot_req = tot_req.reset_index(name="received").rename(columns={'recipient':'us_user'})
ratio = ratio.merge(tot_req,on="us_user",how="left")
ratio['responsed'] = ratio['ratio']*ratio['received']
ratio.sort_values(by="ratio",inplace=True)
In [266]:
fig = ratio.plot.scatter(x="us_user",y="ratio",color="received")
fig.update_layout(
    title="Ratio responses/requests per each supervisor", 
    title_x=0.5,
    xaxis_title='supervisor',
    xaxis_type='category'
)
fig.update_traces(mode='markers',opacity=0.8,)

fig.add_trace(ratio.plot.bar(x="us_user",y="ratio",color="received").data[0])

fig.update_xaxes(showticklabels=False)
fig.show()
In [286]:
ratio['norm'] = ratio['responsed']*ratio['ratio']**2
bests=ratio.sort_values(by=["norm"],ascending=False).head(10)
In [287]:
bests
Out[287]:
us_user ratio start_year user_name received responsed norm
43 336 0.806569 2014 Agustoni Loris 274 221.0 143.772457
82 663 0.721014 2016 Carlo Giovio 276 199.0 103.452518
118 500485 0.962617 2018 Mirjam Trinkler 107 103.0 95.443008
154 500994 0.986486 2019 René Studer 74 73.0 71.040358
107 500352 0.940299 2018 Alberto Fissore 67 63.0 55.702161
125 500720 0.850000 2019 Gabriel Leleu 80 68.0 49.130000
114 500424 0.840000 2018 Roberto Danesi 75 63.0 44.452800
93 500093 1.000000 2018 Tosi Loris 38 38.0 38.000000
126 500722 0.860465 2019 Antonio Bramante 43 37.0 27.394808
61 424 0.853659 2015 Roberto Danesi 41 35.0 25.505651

Edits and delay

In [293]:
df_feedbacks_info = feedbacks
In [294]:
no_answer = df_feedbacks_info['student_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without student grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
There are 361/4739 without student grade (7.62%)
In [295]:
no_answer = df_feedbacks_info['response_date'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without answer ({round(no_answer/len(df_feedbacks_info),3)*100}%)')
There are 2774/4739 without answer (58.5%)
In [296]:
no_answer = df_feedbacks_info['supervisor_grade'].isnull().sum()
print(f'There are {no_answer}/{len(df_feedbacks_info)} without supervisor grade ({round(no_answer/len(df_feedbacks_info),4)*100}%)')
There are 2841/4739 without supervisor grade (59.95%)
In [297]:
response_no_grade = (df_feedbacks_info['response_date'].notnull() & df_feedbacks_info['supervisor_grade'].isnull()).sum()
grade_no_response = (df_feedbacks_info['response_date'].isnull() & df_feedbacks_info['supervisor_grade'].notnull()).sum()

print(f'There are {response_no_grade} responses without grades')
There are 151 responses without grades
In [298]:
grade_no_response #TODO: VERIFICARE QUESTO!
Out[298]:
84
In [299]:
df_feedbacks_info.dropna(inplace=True)
df_feedbacks_info['delay_days'] = df_feedbacks_info['delay_hours']/24
In [300]:
df_hist = df_feedbacks_info.groupby(["activity_school_year","times_before_answer"]).count()[['ac_activity']].reset_index().rename(columns={'ac_activity': 'count'})


fig = df_hist.plot.hist(x="times_before_answer", y='count', 
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    xaxis_type='category',
    title="Requests before a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="Number of requests before the response",
    coloraxis_showscale=False,
)

fig.show()
In [301]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year" )

fig.update_layout(
    barmode='group',
    title="Histogram: days before an answer",
    title_x=0.5,
    yaxis_title="Count",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [302]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of days before an answer",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [303]:
fig = df_feedbacks_info[df_feedbacks_info['edits_after']<5].plot.hist(x="edits_after",  nbins=5,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of edits after a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="# of edits after an answer per feedback request",
)

fig.show()
In [304]:
df_feedbacks_info['has_edit_after'] = (df_feedbacks_info['edits_after']>0).astype(int)
In [305]:
df_count = df_feedbacks_info.groupby(["activity_school_year","supervisor_grade","has_edit_after"]).size().reset_index(name='count')

c = (pd.core.reshape.util.cartesian_product([df_count['activity_school_year'].unique(),df_count['supervisor_grade'].unique(),df_count['has_edit_after'].unique()]))
c = pd.DataFrame(dict(activity_school_year=c[0],supervisor_grade=c[1],has_edit_after=c[2]))
c = c.set_index(["activity_school_year","supervisor_grade","has_edit_after"]).join(df_count.set_index(["activity_school_year","supervisor_grade","has_edit_after"])).reset_index().sort_values(by=['activity_school_year','supervisor_grade'])
c.fillna(0,inplace=True)

df_feedback_edits =  c[c['has_edit_after']==True].drop("has_edit_after",axis=1).rename(columns={'count':'edit'})
df_feedback_edits['no_edit'] = c[c['has_edit_after']==False]['count'].tolist()

df_feedback_edits['ratio'] = df_feedback_edits['edit'].div(df_feedback_edits['no_edit']+df_feedback_edits['edit']).fillna(0)
In [306]:
fig = df_feedback_edits.plot(x="supervisor_grade", y=["edit"],
                             line_group='activity_school_year', color='activity_school_year')
fig.update_layout(
    title="# activities that have been edit after a feedback per grade",
    title_x=0.5,
    yaxis_title="# activities",
    xaxis_title="supervisor grade",
)
fig.update_traces(line_traces)

fig.show()

Logins

In [86]:
dayofweek_map = {1:'Sun',2:'Mon',3:'Tue',4:'Wed',5:'Thu',6:'Fri',7:'Sat'}

logins_students = logins.merge(students,on="us_user",how="inner")
logins_supervisors = logins.merge(supervisors,on="us_user",how="inner")
logins_teachers = logins.merge(teachers,on="us_user",how="inner")

logins['date'] = logins['date'].astype('datetime64')
In [87]:
fig = logins_students.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Apprentices logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [90]:
fig = logins_supervisors.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Supervisors logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [91]:
fig = logins_teachers.groupby(['dayofweek','hour']).size().reset_index(name="count").replace(
    {'dayofweek':dayofweek_map}
).plot(
    x="hour", y=["count"],facet_col='dayofweek', color="dayofweek"
)

fig.update_layout(
    title="Teachers logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)

fig.update_xaxes(title=dict(text=""))
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()